library(tidyverse)
library(tidyplus)
library(vroom)
library(tidygeocoder)
library(lubridate)

# Half-widths, in degrees, of the search box drawn around each job. They
# approximate 20 miles using 69 miles per degree of latitude and 54.6 miles per
# degree of longitude.
lat_delta20 <- 20 / 69
long_delta20 <- 20 / 54.6

# Seed the random number generator so the sampling below is reproducible.
set.seed(0)

# geocoding jobs ---------------------------------------------------
# One row per distinct job posting, geocoded with ArcGIS and enriched with the
# fields the matching steps below read:
#   address             - the location string from the posting
#   rev_geo             - the reverse-geocoded address for (lat, long)
#   zip                 - 5-digit ZIP parsed from the reverse-geocoded address
#   lat_lb .. long_ub   - search box of +/- 20 miles around the job
#   job_state, job_city - parsed from `rev_geo`
#   ind                 - sequential job id (row number)
# `company` is kept because the prior-employer matching and the final merge
# both read it from this table.
geocoded_jobs <- job_data %>% # job listing dataset with locations
  select(url, company, location) %>%
  distinct() %>%
  geocode(location, method = "arcgis") %>%
  reverse_geocode(lat, long, method = "arcgis") %>%
  # The reverse-geocoded address is stored as `rev_geo` before `address` is
  # overwritten with the posted location, so the coordinates are looked up
  # once instead of a second time row by row.
  mutate(
    zip = str_extract(address, " \\d{5}, USA") %>% str_extract("\\d{5}"),
    rev_geo = address,
    address = location
  ) %>%
  select(-location) %>%
  mutate(
    lat_lb = lat - lat_delta20,
    lat_ub = lat + lat_delta20,
    long_lb = long - long_delta20,
    long_ub = long + long_delta20
  ) %>%
  distinct() %>%
  select(address, rev_geo, everything()) %>%
  mutate(
    job_state = str_extract(rev_geo, ", [A-Z]{2}, \\d{5}") %>%
      str_extract("[A-Z]{2}"),
    # The character class allows spaces, so the match can start with the blank
    # that follows the preceding comma; trim it so the city compares equal to
    # the lower-cased city names used during matching.
    # NOTE(review): city names containing punctuation (".", "-", "'") are
    # still only partially captured by this pattern.
    job_city = str_extract(rev_geo, "[A-Za-z ]{1,}, [A-Za-z ]{1,}, \\d{5}") %>%
      str_extract("[A-Za-z ]{1,}") %>%
      str_trim()
  ) %>%
  mutate(ind = row_number())


# matching home address ---------------------------------------------------
# NAD dataset. The matching code below reads its `lat`, `long`, `city` and
# `state` columns.
# TODO: replace the placeholder with the actual location of the NAD dataset.
nad_path <- "path/to/nad_dataset.csv"
nad_formatted <- read_csv(nad_path)

# Sample up to four residential addresses from `nad_formatted` near one job.
#
# Arguments:
#   job_city, job_state  city and state of the job; the city is compared
#                        case-insensitively.
#   lat_lb, lat_ub,
#   long_lb, long_ub     bounds of the search box around the job.
#   id                   job id, stored in the `ind` column of the result.
#
# Returns the sampled rows of `nad_formatted` with three extra columns:
# `ind`, `city1` (lower-cased city) and `dist` (0 = same city and state,
# 1 = same state only, 2 = different city and state, NA otherwise).
get_address <- function(job_city, job_state,
                        lat_lb, lat_ub, long_lb, long_ub, id) {
  job_city <- str_to_lower(job_city)

  # Rank the addresses inside the box (optionally widened by a padding on
  # each side), keep the 200 best-ranked rows and draw four of them.
  sample_candidates <- function(lat_pad = 0, long_pad = 0) {
    nad_formatted %>%
      filter(
        between(lat, lat_lb - lat_pad, lat_ub + lat_pad),
        between(long, long_lb - long_pad, long_ub + long_pad)
      ) %>%
      mutate(
        ind = id,
        city1 = str_to_lower(city),
        dist = case_when(
          city1 == job_city & state == job_state ~ 0,
          city1 != job_city & state == job_state ~ 1,
          city1 != job_city & state != job_state ~ 2
        )
      ) %>%
      distinct() %>%
      arrange(dist) %>%
      head(200) %>%
      slice_sample(n = 4, replace = FALSE)
  }

  locs <- sample_candidates()

  if (nrow(locs) < 4) { # 30 miles each direction
    # The widened box contains every row of the first pass, so it is queried
    # on its own. Appending the first-pass rows again would duplicate them and
    # allow the same address to be drawn twice.
    locs <- sample_candidates(lat_delta20 / 2, long_delta20 / 2)
  }

  locs
}

# Residential addresses for every job: `get_address()` is called once per row
# of `geocoded_jobs`, the results are stacked, prefixed with "residential_"
# (except the job id `ind`) and joined back onto the jobs.
home <- geocoded_jobs %>%
  left_join(
    geocoded_jobs %>%
      pmap(\(job_city, job_state, lat_lb, lat_ub, long_lb, long_ub, ind, ...) {
        get_address(job_city, job_state, lat_lb, lat_ub, long_lb, long_ub, ind)
      }) %>%
      # bind_rows() stacks the whole list in one pass; folding it pairwise
      # with reduce() re-copies the accumulated rows at every step.
      bind_rows() %>%
      rename_with(\(x) str_c("residential_", x)) %>%
      rename("ind" = "residential_ind"),
    by = "ind"
  )

# matching high schools ---------------------------------------------------
# High school dataset, filtered and cleaned for use on a CV.
# TODO: replace the placeholder with the actual location of the dataset.
highschool_path <- "path/to/highschool_dataset.csv"
highschools <- read_csv(highschool_path) %>%
  select(state = MSTATE, name = SCH_NAME, city = MCITY,
         contains("LSTREET"), zip = LZIP, GSHI, SCH_TYPE, CHARTER_TEXT) %>%
  rename_with(str_to_lower) %>%
  # NOTE(review): gshi == 12 presumably means "highest grade offered is 12",
  # and the meaning of sch_type 1 and 3 is not visible here - confirm against
  # the data dictionary.
  # NOTE(review): the name filters match substrings, so e.g. "arts" also
  # excludes names such as "Hartsville".
  filter(gshi == 12,
         sch_type %in% c(1, 3),
         !str_detect(name %>% tolower(), "online"),
         !str_detect(name %>% tolower(), "arts"),
         !str_detect(name %>% tolower(), "prep"),
         !str_detect(name %>% tolower(), "cyber"),
         !str_detect(name %>% tolower(), "virtual"),
         charter_text != "Yes") %>%
  mutate(across(c(name, city), str_to_title)) %>%
  # Expand abbreviations. Word boundaries keep the patterns from firing inside
  # longer words ("Schmidt", "Program"), and the trailing dot is optional and
  # literal so the character after the abbreviation is no longer swallowed.
  mutate(name = str_replace(name, "\\bH S\\b", "High School"),
         name = str_replace(name, "\\bHs\\b", "High School"),
         name = str_replace_all(name, "\\bSch\\b\\.?", "School"),
         name = str_replace(name, "\\bProg\\b\\.?", "Program"),
         name = str_replace(name, "\\bCtr\\b\\.?", "Center"),
         name = str_replace(name, "\\bShs$", "High School"),
         name = str_remove_all(name, "\\([A-Za-z0-9]{1,}\\)"),
         # "^" is a regex anchor; fixed() removes the literal caret instead.
         name = str_remove_all(name, fixed("^")),
         name = str_remove_all(name, "#"),
         name = str_remove_all(name, "&")) %>%
  transmute(name, state, city, zip,
            address = str_c(lstreet1, " ", city, " ", state))

# Sample up to four high schools for one job.
#
# Arguments:
#   job_city, job_state  city and state of the job; the city is compared
#                        case-insensitively.
#   job_zip              ZIP code of the job; schools whose numeric ZIP lies
#                        within 100 of it are considered.
#   id                   job id, stored in the `ind` column of the result.
#
# Returns the sampled rows of `highschools`, columns prefixed with
# "highschool_", plus `ind`, `city1` (lower-cased school city) and `dist`
# (0 = same city and state, 1 = same state only, 2 = different city and state).
get_highschool <- function(job_city, job_state,
                           job_zip, id) {
  target_city <- str_to_lower(job_city)

  # Schools inside the ZIP window, with their columns prefixed.
  candidates <- highschools %>%
    filter(between(as.numeric(zip),
                   as.numeric(job_zip) - 100,
                   as.numeric(job_zip) + 100)) %>%
    rename_with(\(col) str_c("highschool_", col))

  # Closest matches first.
  ranked <- candidates %>%
    mutate(ind = id,
           city1 = str_to_lower(highschool_city),
           dist = case_when(
             highschool_state == job_state & city1 == target_city ~ 0,
             highschool_state == job_state & city1 != target_city ~ 1,
             highschool_state != job_state & city1 != target_city ~ 2
           )) %>%
    distinct() %>%
    arrange(dist)

  # Draw four schools from the 200 best-ranked candidates.
  ranked %>%
    slice_head(n = 200) %>%
    slice_sample(n = 4, replace = FALSE)
}

# High schools for every job: `get_highschool()` is called once per row of
# `geocoded_jobs` and the stacked results are joined back on the job id `ind`.
hs <- geocoded_jobs %>%
  left_join(
    geocoded_jobs %>%
      pmap(\(job_city, job_state, zip, ind, ...) {
        get_highschool(job_city, job_state, zip, ind)
      }) %>%
      # bind_rows() stacks the whole list in one pass; folding it pairwise
      # with reduce() re-copies the accumulated rows at every step.
      bind_rows(),
    by = "ind"
  )

# matching prior employer -------------------------------------------------
# Business dataset. The matching code below reads its `company`,
# `hiring_company`, `lat`, `long`, `city` and `state` columns.
# TODO: replace the placeholder with the actual location of the dataset.
business_path <- "path/to/business_dataset.csv"
ig_merge <- read_csv(business_path)


# Sample up to four prior employers from `ig_merge` near one job, excluding
# businesses whose `company` contains the hiring company's name.
#
# Arguments:
#   job_company          name of the company that posted the job.
#   job_city, job_state  city and state of the job; the city is compared
#                        case-insensitively.
#   lat_lb, lat_ub,
#   long_lb, long_ub     bounds of the search box around the job.
#   id                   job id, stored in the `ind` column of the result.
#
# Returns the sampled rows of `ig_merge` with `ind`, `city1` (lower-cased
# city) and `dist` (0 = same city and state, 1 = same state only,
# 2 = different city and state) added.
get_prior_emp <- function(job_company, job_city, job_state,
                          lat_lb, lat_ub, long_lb, long_ub, id) {
  job_city <- str_to_lower(job_city)

  # Rank the businesses inside the box (optionally widened by a padding on
  # each side), keep the 200 best-ranked rows and draw four of them.
  sample_candidates <- function(lat_pad = 0, long_pad = 0) {
    ig_merge %>%
      # NOTE(review): `hiring_company` is normalised here but the exclusion
      # below tests `company` - confirm which column is meant to be compared.
      mutate(hiring_company = str_remove_all(hiring_company, "'") %>%
               str_to_lower()) %>%
      # fixed() compares the name literally; as a regex, names containing
      # characters such as "(", "+" or "." would error or match too much.
      filter(!str_detect(company, fixed(job_company))) %>%
      filter(
        between(lat, lat_lb - lat_pad, lat_ub + lat_pad),
        between(long, long_lb - long_pad, long_ub + long_pad)
      ) %>%
      mutate(
        ind = id,
        city1 = str_to_lower(city),
        dist = case_when(
          city1 == job_city & state == job_state ~ 0,
          city1 != job_city & state == job_state ~ 1,
          city1 != job_city & state != job_state ~ 2
        )
      ) %>%
      distinct() %>%
      arrange(dist) %>%
      head(200) %>%
      slice_sample(n = 4, replace = FALSE)
  }

  prior <- sample_candidates()

  if (nrow(prior) < 4) {
    # The widened box contains every row of the first pass, so it is queried
    # on its own. Appending the first-pass rows again would duplicate them and
    # allow the same business to be drawn twice.
    prior <- sample_candidates(lat_delta20 / 2, long_delta20 / 2)
  }

  prior
}

# Prior employers for every job: `get_prior_emp()` is called once per row of
# `geocoded_jobs`, the results are stacked, prefixed with "prior_" (except the
# job id `ind`) and joined back onto the jobs.
prior <- geocoded_jobs %>%
  left_join(
    geocoded_jobs %>%
      pmap(\(company, job_city, job_state,
             lat_lb, lat_ub, long_lb, long_ub, ind, ...) {
        get_prior_emp(company, job_city, job_state,
                      lat_lb, lat_ub, long_lb, long_ub, ind)
      }) %>%
      # bind_rows() stacks the whole list in one pass; folding it pairwise
      # with reduce() re-copies the accumulated rows at every step.
      bind_rows() %>%
      rename_with(\(x) str_c("prior_", x)) %>%
      rename("ind" = "prior_ind"),
    by = "ind"
  )

# combining matches -------------------------------------------------------
# One row per (job, applicant slot): the n-th residential address, n-th high
# school and n-th prior employer of each posting are lined up through a
# per-url running number `row_id`, and slots missing any of the three are
# dropped.
jobs_apps_combined <- local({
  job_keys <- c("url", "address", "company", "zip", "ind",
                "job_city", "job_state")
  slot_keys <- c(job_keys, "row_id")

  # Remove every column whose name contains "lat" or "long".
  drop_coordinates <- function(df) {
    df %>%
      select(-contains("lat"),
             -contains("long"))
  }

  # Number the rows of each posting 1, 2, ... in their current order.
  number_rows_per_url <- function(df) {
    df %>%
      group_by(url) %>%
      mutate(row_id = row_number()) %>%
      ungroup()
  }

  geocoded_jobs %>%
    select(all_of(job_keys)) %>%
    left_join(drop_coordinates(home), by = job_keys) %>%
    number_rows_per_url() %>%
    full_join(number_rows_per_url(drop_coordinates(hs)), by = slot_keys) %>%
    full_join(number_rows_per_url(drop_coordinates(prior)), by = slot_keys) %>%
    filter(!is.na(prior_address),
           !is.na(residential_address1),
           !is.na(highschool_address)) %>%
    select(-contains("city1"),
           -contains("rev_geo")) %>%
    # Spell out "#" in addresses and "&" everywhere except the url.
    mutate(across(contains("address"), \(x) str_replace_all(x, "#", "No.")),
           across(-url, \(x) str_replace_all(x, "&", "and")))
})

  
# prepare CV -----------------------------------------------------------------

# merge names 

# First names in long form: one row per name with the race and gender encoded
# in the header of the column it came from. A header containing "Black" maps
# to race "b", anything else to "w"; a header whose final digit is above 5 is
# marked female.
# NOTE(review): only the last digit of the header is read, so a header ending
# in "10" would be read as 0 - confirm the header numbering.
names <- read_csv("names.csv") %>%
  pivot_longer(cols = everything(),
               names_to = "type",
               values_to = "name") %>%
  transmute(name,
            race = if_else(str_detect(type, "Black"), "b", "w"),
            is_female = as.numeric(str_extract(type, "\\d$")) > 5)

# Surnames in long form: one row per surname with the race taken from the
# column header (containing "black" maps to "b", anything else to "w").
surnames <- read_csv("surnames.csv") %>%
  pivot_longer(cols = everything(),
               names_to = "type",
               values_to = "surname") %>%
  transmute(surname,
            race = if_else(str_detect(type, "black"), "b", "w"))


# Build the pool of full names for one race: every first name crossed with
# every surname of that race, shuffled, with a random `name_id` (1..group
# size) assigned within each gender.
#
# race_code is "b" or "w", as stored in the `race` column of `names` and
# `surnames`. Returns a tibble with columns name, surname, race, is_female
# and name_id.
build_name_pool <- function(race_code) {
  first_names <- names %>%
    filter(race == race_code)
  last_names <- surnames %>%
    filter(race == race_code)

  # Naming the grid columns directly avoids recovering them from the
  # auto-generated column names; joining against this race's first names only
  # keeps a name that appears under both races from pulling in the other
  # race's rows.
  expand_grid(name = first_names$name,
              surname = last_names$surname) %>%
    left_join(first_names,
              by = "name") %>%
    slice_sample(prop = 1) %>% # randomizing order
    group_by(is_female) %>%
    mutate(name_id = sample(n())) %>%
    ungroup()
}

black <- build_name_pool("b")

white <- build_name_pool("w")

# phone number and email: example for 20 applicants

# phone numbers
set.seed(99)
phone <- phone_number_data %>% # dataset of phone number purchased
  # phone1: first run of three digits; phone2: first three digits that are
  # directly followed by "-"; phone3: the four digits ending the string.
  # NOTE(review): for numbers written as "123-456-7890", phone2 would repeat
  # phone1 - confirm the format of the purchased numbers.
  transmute(
    phone1 = str_extract(phone, "\\d{3}"),
    phone2 = str_extract(str_extract(phone, "\\d{3}-"), "\\d{3}"),
    phone3 = str_extract(phone, "\\d{4}$")
  ) %>%
  # Randomly print each number in one of two layouts.
  mutate(
    rand = runif(20),
    phone = if_else(
      rand <= 0.5,
      str_c("(", phone1, ") ", phone2, "-", phone3),
      str_c(phone1, " ", phone2, " ", phone3)
    )
  )

# Contact details for the 20 example applicants: five names per gender are
# drawn from each race's pool, then every applicant gets a random e-mail
# domain, a user name built from the name in one of four random layouts
# (about 70% of them with a random number appended) and a phone number.
# The random draws run in a fixed order, which the seed set above relies on.
name_contact_finished <- list(black, white) %>%
  map(\(pool) {
    pool %>%
      group_by(is_female) %>%
      slice_sample(n = 5) %>%
      ungroup()
  }) %>%
  bind_rows() %>%
  distinct(name, surname) %>%
  mutate(
    domain = sample(c("@mailprofessional.live", "@voyagemail.pro"),
                    20, replace = TRUE),
    rand = runif(20),
    usrsty = case_when(
      between(rand, 0, 0.25) ~ str_to_lower(str_c(name, surname)),
      between(rand, 0.25, 0.5) ~ str_to_lower(str_c(surname, name)),
      between(rand, 0.5, 0.75) ~
        str_to_lower(str_c(str_extract(name, "\\w"), surname)),
      between(rand, 0.75, 1) ~
        str_to_lower(str_c(surname, str_extract(name, "\\w")))
    ),
    rand2 = runif(20),
    usrsty = if_else(rand2 > 0.7, usrsty, str_c(usrsty, sample(1:999, 20)))
  ) %>%
  bind_cols(select(phone, phone)) %>%
  select(!contains("rand"))



# treatment assignment
# Applicant roster with treatment: the contact details are joined back to the
# name pools to recover race and gender, the e-mail address is assembled, and
# within each race x gender cell the five applicants are ranked at random.
# Ranks above 2.5 (three of the five) are flagged as having experience.
unique_names <- name_contact_finished %>%
  inner_join(bind_rows(black, white),
             by = c("name", "surname")) %>%
  mutate(email = str_c(usrsty, domain)) %>%
  select(!name_id) %>%
  group_by(race, is_female) %>%
  mutate(rand = sample(5), # assign treatment
         exp = rand > 2.5) %>%
  ungroup()


# matching jobs and applicants ---------------------------------------------------------------
# Build the four applicant roles for one job, one per combination of race
# ("b"/"w") and prior experience (TRUE/FALSE), each with a randomly generated
# timeline. Applicants are mapped onto these roles afterwards, making sure no
# applicant applies to the same job twice.
#
# Arguments:
#   job_id  id of the job, stored in the `ind` column of the result.
#   today   reference date for ages and employment spells; defaults to the
#           current date.
#
# Returns a 4-row tibble with race, exp, temp (random order 1-4), exp_set
# (random 1/2 among experienced roles, -999 otherwise), dob, hs_grad,
# hs_start, emp_start, emp_end (NA without experience), ind, rand and
# is_female.
treatment_assign <- function(job_id, today = Sys.Date()) {
  assign <- tibble(race = c("b", "b", "w", "w"),
                   exp = c(TRUE, FALSE, TRUE, FALSE)) %>%
    mutate(temp = sample(1:4, 4)) %>%
    group_by(exp) %>%
    mutate(exp_set = sample(1:2, 2)) %>%
    ungroup() %>%
    mutate(
      exp_set = if_else(!exp, -999, exp_set),
      # assuming prior employment lasts 0.5 - 2 years
      work = if_else(!exp, 0, round(runif(n = 4, 0.5, 2), 1)),
      # assuming graduating high school at 17 - 19
      hs_age = round(runif(n = 4, 17, 19)),
      current_age = if_else(!exp, hs_age, round(hs_age + work)),
      # assuming currently unemployed
      dob = make_date(year(today) - current_age - 1, 1, 1) +
        duration(sample(0:364, n()), unit = "days"),
      hs_grad = make_date(year(dob) + hs_age, 1, 1) +
        duration(sample(170:220, n()), unit = "days"),
      hs_start = make_date(year(hs_grad) - 3, 1, 1) +
        duration(sample(210:250, n()), unit = "days"),
      emp_days = round(work * 365),
      # Days between graduation and `today` not covered by the job. This is
      # computed numerically (parsing it out of a printed period dropped the
      # minus sign) and floored at zero, so a job that does not fit into the
      # window starts right at graduation.
      unemp = pmax(as.numeric(today - hs_grad, units = "days") - emp_days, 0),
      # Share of the slack that falls before the job.
      unemp_allo1 = runif(n = 4, 0, 1),
      emp_start = if_else(!exp,
                          as.Date(NA),
                          hs_grad + days(round(unemp * unemp_allo1))),
      emp_end = if_else(!exp,
                        as.Date(NA),
                        emp_start + days(emp_days))
    ) %>%
    transmute(race, exp, temp, exp_set, dob,
              hs_grad, hs_start, emp_start, emp_end, ind = job_id) %>%
    group_by(race, exp) %>%
    mutate(rand = runif(n()),
           is_female = rand > 0.5) %>%
    ungroup()

  assign
}

# example matching
# Pair the n-th applicant slot of every job with the n-th role generated by
# `treatment_assign()` for that job; `app_id` numbers slots and roles within
# a job.
jobs_apps_combined %>%
  group_by(ind) %>%
  mutate(app_id = row_number()) %>%
  ungroup() %>%
  inner_join(
    jobs_apps_combined %>%
      distinct(ind) %>%
      pull(ind) %>%
      map(\(x) treatment_assign(x)) %>%
      # bind_rows() stacks the whole list in one pass; folding it pairwise
      # with reduce() re-copies the accumulated rows at every step.
      bind_rows() %>%
      group_by(ind) %>%
      mutate(app_id = row_number()) %>%
      ungroup(),
    by = c("ind", "app_id")
  )

# we match applicants to these jobs based on race and experience; we exclude 
# applicants who applied to the same company previously so each applicant only
# applied to each company at most once.